Genome-resolved metagenomics on anaerobic enrichments from deep subsurface groundwaters

Author

george.westmeijer@umu.se

Published

November 29, 2024

Read data and assign colors

The 16S ribosomal RNA data is from several sequencing projects. In chronological order, P15009 contains the groundwater samples from t0 (sampled Oct 2019), while P21015 contains the samples from t24 (sampled Feb 2020) from identical groundwaters. P21015 contains the sequencing libraries from the cultures as part of phase a (one meteoric groundwater), while P26201 contains the libraries from the cultures as part of phase b (three groundwaters as inoculum).

All 16S libraries were processed with the bioinformatic pipeline nf-core/ampliseq (v2.12.0), using the SBDI-GTDB (R09-RS220-1) as a reference.

Expand code
seqtab <- read_tsv("data/ASV_table.tsv.gz", col_types = cols(.default = col_character(), 
                                    count = col_integer()
                                    )) %>%
  group_by(sample) %>% mutate(relab = count / sum(count)) %>% ungroup()

tax <- read_tsv("data/ASV_tax.tsv.gz", col_types = cols(.default = col_character()))
               
smd <- read_tsv("data/smd.tsv", col_types = cols(.default = col_character())) 

This project includes 15 metagenomes from 15 unique cultures. The metagenomes contain 35 de-replicated metagenome-assembled genomes (MAG) that are affiliated with the Bacillota (7), Desulfobacterota (7), Pseudomonadota (6), Bacteroidota (4), Patescibacteria (3), Nanoarchaeota (2), Acidobacteriota (1), Campylobacterota (1), Chloroflexota (1), Iainarchaeota (1), Spirochaetota (1), and Verrucomicrobiota (1).

Figure 1: sampling-site
Expand code
coverm <- data.frame() # Soon will contain 35 de-replicated genomes

for (file in list.files("data/coverm", full.names = TRUE)) {
  name <- substr(file, 13, nchar(file)) %>% gsub("_L002_1.trim.fastq.gz", "", .)
  i <- read_tsv(file, show_col_types = F) %>% rename(genome = 1, tpm = 2)
  i <- i %>% mutate(sample = name) %>% filter(tpm > 0) %>% select(genome, sample, tpm)
  coverm <- coverm %>% rbind(i)
}

emapper <- data.frame()

for (file in list.files("data/emapper", full.names = TRUE)) { # List all files in the emapper directory
  name <- substr(file, 14, nchar(file)) %>% gsub(".emapper.annotations", "", .) # Extract the bin ref from the filename
  i <- read_tsv(file, show_col_types = F, comment = "##") # Read the file
  i <- i %>% rename(query = 1) %>% mutate(genome = name) %>% distinct() %>% select(genome, everything()) %>% mutate(KEGG_ko = gsub("ko:", "", KEGG_ko))
  emapper <- emapper %>% rbind(i)
}

files <- list.files("data/prokka", full.names = TRUE, pattern = "tsv")
prokka <- data.frame()
for (file in files) {
  name <- gsub("data/prokka/", "", file) %>% gsub(".tsv", "", .)
  i <- read_tsv(file, show_col_types = F, comment = "##") %>%
    mutate(genome = name) %>% distinct() %>% 
    select(genome, everything()) %>% 
    select(-locus_tag) %>% 
    rename(ec = EC_number)
  prokka <- prokka %>% rbind(i)
}

# Sample 35 (l7f) was removed from the binning process as this sample did not produce any bins
bintax <- read_tsv("data/gtdbtk.summary.tsv", col_types = cols(.default = col_character())) 

quality <- read_tsv("data/quality_report.tsv", show_col_types = F) %>%
  rename_with(tolower) %>%
  rename(genome = name) %>%
  filter(genome %in% coverm$genome)
Expand code
cols.gw <- c(
  "KR0015" = "#A2A475",
  "SA1420" = "#899DA4",
  "SA2600" = "#C7B19C"
  )

cols.origin <- c(
  "meteoric" = "#A2A475",
  "marine" = "#899DA4",
  "saline" = "#C7B19C"
  )

cols.fraction <- c(
  "environmental" = "#8D8680",
  "fractionated" = "#A2A475",
  "non-fractionated" = "#C7B19C"
  )

cols.mag <- c(
  "Other" = "#899DA4",
  "Nanoarchaeota" = "#046C9A",
  "Patescibacteria" = "#C7B19C",
  "Pseudomonadota" = "#972D15",
  "Desulfobacterota" = "#A2A475",
  "Bacillota" = "#1B5331"
  )

cols.phylum <- c( # Sorted on abundance
  "Patescibacteria" = "#C7B19C",
  "Desulfobacterota" = "#A2A475",
  "Chloroflexota" = "#D69C4E",
  "Atribacterota" = "#972D15",
  "Acidobacteriota" = "#FAEFD1",
  "Omnitrophota" = "#00A08A",
  "Nitrospirota" = "#D8B70A",
  "Campylobacterota" = "#1B5331",
  "Other" = "#899DA4",
  "Unidentified" = "#046C9A"
)

16S data groundwaters

Rarefaction 16S data

The 16S data from the groundwater samples are rarefied to check if there is sufficient sequencing depth to estimate microbial diversity.

Expand code
t0 <- c(
  "P15009_1057","P15009_1059","P15009_1060","P21015_1020","P21015_1021","P21015_1022", # KR0015
  "P15009_1061","P15009_1065","P15009_1066","P21015_1023","P21015_1024","P21015_1025", # SA1420
  "P15009_1069","P15009_1062","P15009_1063","P21015_1029","P21015_1030","P21015_1031"  # SA2600
       )

# Perform the subsampling
rare <- seqtab %>%
  # Remove negative control
  filter(sample %in% t0) %>%
  select(-relab) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  column_to_rownames("sample") %>%
  vegan::rarecurve(sample = 10000, step = 100, tidy = TRUE) %>%
  tibble() %>%
  rename_with(tolower)
Expand code
rare %>%
  inner_join(smd, by = c("site" = "sample")) %>%
  # One sample has ~ 1 million reads 
  filter(sample < 150000) %>%
  ggplot(aes(sample, species, group = site, colour = origin)) + 
  geom_line(linewidth = 0.75) +
  scale_color_manual(values = cols.origin, name = "Groundwater", labels = c("Meteoric", "Marine", "Saline")) +
  labs(x = "No. of sequenced reads", y = "Rarefied No. of ASVs") +
  scale_x_continuous(labels = c("0", "50.000", "100.000", "150.000")) +
  theme_tidy()
Figure 2: Rarefaction curves for the groundwater samples (t0) for each groundwater type (n = 6).

Overview community structure t0

To visualize the community composition, the most abundant phyla are identified while grouping less abundant phyla as “Other”. ASVs that are not identified on phylum level are included as “Unidentified”.

Expand code
t <- seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(8, mean_relab)

t %>% arrange(desc(mean_relab)) %>% knitr::kable()
phylum mean_relab
Patescibacteria 7.1312325
Desulfobacterota 2.7429408
Chloroflexota 1.2292858
Atribacterota 1.2125307
Acidobacteriota 0.9182271
Omnitrophota 0.7237157
Nitrospirota 0.3144247
Campylobacterota 0.2617040
Expand code
taxref <- tax %>%
  left_join(t %>% transmute(phylum, topphylum = phylum), by = "phylum") %>%
  mutate(topphylum = if_else(is.na(phylum), "Unidentified", topphylum)) %>%
  replace_na(list("topphylum" = "Other"))
Expand code
p3 <- seqtab %>%
  filter(sample %in% t0) %>%
  inner_join(taxref, by = "seqid") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, topphylum) %>% 
  summarise(mrelab = sum(relab), .groups = 'drop') %>%
  # Call the plot
  ggplot(aes(x = fct_relevel(sample, t0), 
             y = mrelab * 100, 
             fill = fct_relevel(topphylum, names(cols.phylum) %>% rev())
             )
         ) +
  labs(x = '', y = 'Relative abundance (%)', fill = "") +
  geom_col() + scale_y_continuous(expand = c(0.01,0), limits = c(-10,101)) +
  annotate(geom = "segment", x =  0.55, xend =  3.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  3.55, xend =  6.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  6.55, xend =  9.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x =  9.55, xend = 12.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 12.55, xend = 15.45, y = -1, yend = -1, linewidth = 0.5) +
  annotate(geom = "segment", x = 15.55, xend = 18.45, y = -1, yend = -1, linewidth = 0.5) +
  # Labels dates
  annotate(geom = "text", x =  2, y = -3.5, label = "Oct 2019", size = 7/.pt) +
  annotate(geom = "text", x =  5, y = -3.5, label = "April 2020", size = 7/.pt) +
  annotate(geom = "text", x =  8, y = -3.5, label = "Oct 2019", size = 7/.pt) +
  annotate(geom = "text", x = 11, y = -3.5, label = "April 2020", size = 7/.pt) +
  annotate(geom = "text", x = 14, y = -3.5, label = "Oct 2019", size = 7/.pt) +
  annotate(geom = "text", x = 17, y = -3.5, label = "April 2020", size = 7/.pt) +
  # Labels groundwater
  annotate(geom = "text", x = 3.5, y = -8, label = "Meteoric", size = 7/.pt) +
  annotate(geom = "text", x = 9.5, y = -8, label = "Marine", size = 7/.pt) +
  annotate(geom = "text", x =15.5, y = -8, label = "Saline", size = 7/.pt) +
  scale_fill_manual(values = cols.phylum) + guides(fill = guide_legend(ncol = 3, reverse = TRUE)) +
  theme_tidy(ratio = 0.75, legend = "bottom") + 
  theme(
    axis.text.x = element_blank(), 
    axis.ticks.x = element_blank(),
    legend.key.height = unit(3.5, "mm"),
    legend.margin = margin(t = -20),
    legend.text = element_text(margin = margin(l = 2)),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.key.spacing.y = unit(0, "mm")
    )

Redundancy analysis

Expand code
site <- magick::image_read("figures/äspö-mod.png")

p1 <- ggplot() + 
  annotation_custom(grid::rasterGrob(site)) + 
  theme_minimal() + coord_fixed() +
  theme(
    plot.margin = margin(), 
    aspect.ratio = 0.75
    )

As the hydrochemistry data is from a monitoring program, the sampling data closest to groundwater sampling was selected. All units are mg l-1, except for EC (mS m-1), pH, T (degrees C), and O18 (ppt SMOW)

Expand code
chem <- read_tsv("data/sicada.txt", show_col_types = FALSE) %>% 
  rename_with(tolower) %>% 
  filter(sampling) %>%
  select(groundwater = idcode, so4, feii, ph_lab, ec_lab, temp_w, doc, nh4_n, o18, s34_so4) %>%
  inner_join(smd[,c("sample","groundwater")], by = "groundwater")

The number of ASVs per sample are estimated with the specnumber() function from the Vegan library.

Expand code
adiv <- seqtab %>%
  select(-relab) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  column_to_rownames('sample') %>%
  vegan::specnumber() %>% as.data.frame() %>%
  rownames_to_column('sample') %>% filter(sample %in% t0) %>%
  rename(specnumber = 2)

chem <- adiv %>% # Add the species richness for each sample
  inner_join(chem, by = "sample") 

In order to run the RDA, the absolute counts are standardized using a Hellinger transformation (square root of the relative abundance).

Expand code
hellinger <- seqtab %>%
  filter(sample %in% t0) %>%
  # Standard Vegan transformation: Turn table with with samples as *rows*
  select(sample, seqid, count) %>%
  pivot_wider(names_from = "seqid", values_from = "count", values_fill = 0) %>%
  # Turn into a numeric matrix
  column_to_rownames('sample') %>% vegan::decostand(method = "hellinger") %>%
  rownames_to_column("sample") %>%
  pivot_longer(-1, names_to = "seqid", values_to = "hellinger")

m <- hellinger %>% # Create a matrix
  spread(seqid, hellinger) %>%
  column_to_rownames("sample")

The RDA is called while providing a large number of constraining factors, as the RDA function will by default not include redundant variables (Pearson r > 0.9).

Expand code
set.seed(999)

vegan::rda(
  m ~ o18 + doc + ph_lab + ec_lab + temp_w + nh4_n + feii, 
  correlation = T,
  data = chem %>%
    semi_join(chem, by = 'sample') %>%
    filter(sample %in% t0) %>%
    arrange(sample) %>%
    column_to_rownames('sample') 
) -> rda

Some constraints or conditions were aliased because they were redundant. This
can happen if terms are linearly dependent (collinear): 'ph_lab', 'ec_lab',
'temp_w', 'nh4_n', 'feii'
Expand code
# This one will contain the proportions explained which we get by dividing the
# eigenvalue of each component with the sum of eigenvalues for all components.
p <- c(rda$CCA$eig, rda$CA$eig)/sum(c(rda$CCA$eig, rda$CA$eig))
# This one is collecting the coordinates for arrows that will depict how the 
# factors in our model point in the coordinate
a <- rda$CCA$biplot %>% data.frame() %>% tibble::rownames_to_column('factor')

lab <- tibble(factor = c("o18", "doc"),
            label = c("delta^18*O", "DOC")
) %>% inner_join(a, by = "factor")

summary(rda)

Call:
rda(formula = m ~ o18 + doc + ph_lab + ec_lab + temp_w + nh4_n +      feii, data = chem %>% semi_join(chem, by = "sample") %>%      filter(sample %in% t0) %>% arrange(sample) %>% column_to_rownames("sample"),      correlation = T) 

Partitioning of variance:
              Inertia Proportion
Total          0.6761     1.0000
Constrained    0.3871     0.5726
Unconstrained  0.2890     0.4274

Eigenvalues, and their contribution to the variance 

Importance of components:
                        RDA1   RDA2     PC1     PC2     PC3     PC4     PC5
Eigenvalue            0.2338 0.1533 0.05444 0.04926 0.04333 0.02921 0.02274
Proportion Explained  0.3458 0.2267 0.08052 0.07286 0.06409 0.04321 0.03363
Cumulative Proportion 0.3458 0.5726 0.65309 0.72595 0.79004 0.83325 0.86688
                          PC6     PC7     PC8      PC9     PC10    PC11
Eigenvalue            0.01598 0.01337 0.01223 0.009816 0.008646 0.00766
Proportion Explained  0.02363 0.01978 0.01808 0.014520 0.012789 0.01133
Cumulative Proportion 0.89052 0.91029 0.92838 0.942898 0.955687 0.96702
                          PC12     PC13     PC14     PC15
Eigenvalue            0.007136 0.006260 0.004717 0.004187
Proportion Explained  0.010555 0.009259 0.006977 0.006193
Cumulative Proportion 0.977571 0.986830 0.993807 1.000000

Accumulated constrained eigenvalues
Importance of components:
                        RDA1   RDA2
Eigenvalue            0.2338 0.1533
Proportion Explained  0.6040 0.3960
Cumulative Proportion 0.6040 1.0000
Expand code
vegan::RsquareAdj(rda)
$r.squared
[1] 0.5725727

$adj.r.squared
[1] 0.5155824
Expand code
p2 <- rda$CCA$wa %>% data.frame() %>% tibble::rownames_to_column('sample') %>%
  inner_join(chem, by = 'sample') %>%
  ggplot(aes(x = RDA1, y = RDA2)) +
  geom_vline(xintercept = 0, colour = "grey", linetype = "dotted") +
  geom_hline(yintercept = 0, colour = "grey", linetype = "dotted") +
  geom_point(aes(colour = groundwater, size = specnumber), shape = 1, stroke = 0.5) +
  xlab(sprintf("RDA1 (%2.1f%% explained)", p[[1]] * 100)) +
  ylab(sprintf("RDA2 (%2.1f%% explained)", p[[2]] * 100)) +
  geom_segment(
    data = a, mapping = aes(xend = RDA1/5, yend = RDA2/5), linewidth = 0.3,
    x = 0, y = 0, arrow = arrow(length = unit(1.5, "mm"), type = "closed") 
    ) +
  geom_label(
    data = lab, mapping = aes(x = RDA1/10, y = RDA2/10, label = label),
    size = 7/.pt, parse = TRUE, label.size = 0, label.padding = unit(0.1, "lines")
    ) + 
  scale_size(range = c(1,8), name = "No. of ASVs") +
  annotate("text", x = -0.07, y =-0.3, label = "Meteoric", colour = cols.gw[1], size = 7/.pt) +
  annotate("text", x = 0.06, y = 0.28, label = "Marine", colour = cols.gw[2], size = 7/.pt) +
  annotate("text", x =  0.2, y = -0.05, label = "Saline", colour = cols.gw[3], size = 7/.pt) +
  stat_ellipse(aes(x = RDA1, y = RDA2, colour = groundwater), 
               geom = "polygon", show.legend = F,
               fill = NA, linetype = "dashed") +
  # Add R-squared
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2, \" = 0.52\")", 
           size = 7/.pt, vjust = -0.5, hjust = 1.05, parse = TRUE) +
  scale_color_manual(name = "",values = cols.gw, guide = "none") +
  scale_fill_manual(name = "",values = cols.gw, guide = "none") +
  theme_tidy() +
  theme(
    legend.background = element_rect(fill = NA),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    legend.text = element_text(margin = margin(r = 12, unit = "pt")),
    legend.position = c(0.9,0.85)
    )
Warning: A numeric `legend.position` argument in `theme()` was deprecated in ggplot2
3.5.0.
ℹ Please use the `legend.position.inside` argument of `theme()` instead.
Expand code
library(patchwork)

p2 + p3 & theme(plot.margin = margin(l = 4))

Expand code
ggsave("figures/fig-1.pdf", width = 18, height = 10, units = "cm")

Network analysis: phylum

Expand code
t <- c(
  "Spirochaetota", "Acidobacteriota", "Atribacterota", "Chloroflexota",
  "Omnitrophota", "Nanoarchaeota", "Campylobacterota", "Pseudomonadota"
)

network.survey <- read_tsv("data/network-survey.tsv", show_col_types = F)

# Correlation between the abundance of the phyla using the cultures
fit <- tibble("phylum" = t, "r.squared" = NA)
for (i in t) {
  model <- lm(relab ~ cpr, data = network.survey %>% filter(phylum == i)) %>% summary()
  fit[fit$phylum == i, "r.squared"] <- model$adj.r.squared %>% round(digits = 2)
}

fit
# A tibble: 8 × 2
  phylum           r.squared
  <chr>                <dbl>
1 Spirochaetota         0.57
2 Acidobacteriota       0.55
3 Atribacterota         0.51
4 Chloroflexota         0.14
5 Omnitrophota          0.14
6 Nanoarchaeota         0.08
7 Campylobacterota      0.05
8 Pseudomonadota       -0.01
Expand code
p1 <- network.survey %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_text(data = fit %>% mutate(phylum = factor(phylum, levels = phylum)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}:", r.squared)), 
            size = 7/.pt, hjust = 1.1, vjust = -0.3, parse = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "", y = "Sqrt relative abundance (%)") +
  theme_tidy() +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~phylum, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
t <- c(
  "Omnitrophota", "Chloroflexota", "Pseudomonadota", "Actinomycetota",
  "Verrucomicrobiota", "Acidobacteriota", "Bacillota", "Spirochaetota"
)

network <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "non-fractionated") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% t) %>%
  group_by(sample, phylum) %>% summarise(relab = sum(relab), .groups = "drop") 

network <- seqtab %>% 
  inner_join(tax, by = "seqid") %>% filter(phylum == "Patescibacteria") %>%
  group_by(sample, phylum) %>% summarise(cpr = sum(relab), .groups = "drop") %>% select(-phylum) %>%
  inner_join(network, by = "sample")

# Correlation between the abundance of the phyla using the cultures
fit <- tibble("phylum" = t, "r.squared" = NA)
for (i in t) {
  model <- lm(relab ~ cpr, data = network %>% filter(phylum == i)) %>% summary()
  fit[fit$phylum == i, "r.squared"] <- model$adj.r.squared %>% round(digits = 2)
}

fit
# A tibble: 8 × 2
  phylum            r.squared
  <chr>                 <dbl>
1 Omnitrophota           0.87
2 Chloroflexota          0.64
3 Pseudomonadota         0.04
4 Actinomycetota         0.01
5 Verrucomicrobiota     -0.01
6 Acidobacteriota       -0.01
7 Bacillota             -0.01
8 Spirochaetota         -0.01
Expand code
p2 <- network %>%
  mutate(phylum = factor(phylum, levels = fit$phylum)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_text(data = fit %>% mutate(phylum = factor(phylum, levels = phylum)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}:", r.squared)), 
            size = 7/.pt, hjust = 1.1, vjust = -0.3, parse = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "", y = "Sqrt relative abundance (%)") +
  theme_tidy() +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~phylum, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )
Expand code
library(patchwork)
p1 / p2 + plot_annotation(tag_levels = c("a", "b")) & theme(
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  axis.ticks.length = unit(0.5, "mm"),
  plot.margin = margin(),
  plot.tag = element_text(size = 7),
  plot.tag.position = c(0.035, 0.96)
)

Expand code
ggsave("figures/network-phylum.pdf", width = 14, height = 16, units = "cm")

Network analysis: order

Expand code
# The file survey-amplicons.tsv contains the ASVs from Äspö groundwaters published in https://doi.org/10.3389/fmicb.2018.02880
survey.amplicons <- read_tsv("data/survey-amplicons.tsv", show_col_types = FALSE) %>%
  group_by(sample) %>% mutate(relab = count / sum(count)) %>% ungroup()

cpr <- survey.amplicons %>%
  filter(phylum == "Patescibacteria") %>%
  group_by(sample, order) %>% summarise(cpr = sum(relab), .groups = "drop") 

network.survey <- survey.amplicons %>%
  filter(phylum %in% t) %>%
  group_by(sample, order) %>% summarise(relab = sum(relab), .groups = "drop") %>% 
  # Add colum with abundance of CPR
  left_join(cpr, by = "sample", relationship = "many-to-many") %>%
  rename(order.clade = order.x, order.cpr = order.y)

# The filter > 19 implies that the two orders have to co-occur at least ten times
fit <- network.survey %>%
  group_by(order.clade, order.cpr) %>% summarize(n = n(), .groups = "drop_last") %>% 
  filter(n > 19) %>% ungroup() %>% na.omit()

for (i in 1:nrow(fit)) {
  data = network.survey %>% filter(order.clade == fit[i,]$order.clade & order.cpr == fit[i,]$order.cpr)
  model <- lm(relab ~ cpr, data = data)
  fit[i, "r.squared"] <- summary(model)$r.squared
}

fit <- fit %>% slice_max(r.squared, n = 10) %>%
  mutate(clade = paste(order.clade, order.cpr))
Expand code
p1 <- network.survey %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  filter(clade %in% fit$clade) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  #geom_text(data = fit %>% mutate(phylum = factor(phylum, levels = phylum)), 
  #          aes(x = Inf, y = -Inf, label = r.squared), 
  #          size = 7/.pt, hjust = 1.1, vjust = -0.3, parse = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "", y = "Sqrt relative abundance (%)") +
  theme_tidy() +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~order.clade + order.cpr, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )

p1

Expand code
t <- c(
  "Omnitrophota", "Chloroflexota", "Pseudomonadota", "Actinobacteriota",
  "Verrucomicrobiota", "Acidobacteriota", "Bacillota", "Spirochaetota"
)

cpr <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "All cells") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum == "Patescibacteria") %>%
  group_by(sample, order) %>% summarise(cpr = sum(relab), .groups = "drop") 

network <- seqtab %>%
  inner_join(smd, by = "sample") %>% 
  filter(fraction == "All cells") %>%
  inner_join(tax, by = "seqid") %>% filter(phylum %in% t) %>%
  group_by(sample, order) %>% summarise(relab = sum(relab), .groups = "drop") %>% 
  # Add colum with abundance of CPR
  left_join(cpr, by = "sample", relationship = "many-to-many") %>%
  rename(order.clade = order.x, order.cpr = order.y)

# The filter > 19 implies that the two orders have to co-occur at least ten times
fit <- network %>%
  group_by(order.clade, order.cpr) %>% summarize(n = n(), .groups = "drop_last") %>% 
  filter(n > 19) %>% ungroup() %>% na.omit()

for (i in 1:nrow(fit)) {
  data = network %>% filter(order.clade == fit[i,]$order.clade & order.cpr == fit[i,]$order.cpr)
  model <- lm(relab ~ cpr, data = data)
  fit[i, "r.squared"] <- summary(model)$adj.r.squared
}
fit <- fit %>% slice_max(r.squared, n = 12) %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  mutate(r.squared = signif(.$r.squared, digits = 2)) %>%
  arrange(desc(r.squared))
Expand code
p2 <- network %>%
  mutate(clade = paste(order.clade, order.cpr)) %>%
  filter(clade %in% fit$clade) %>%
  mutate(clade = factor(clade, levels = fit$clade)) %>%
  ggplot(aes(
    x = sqrt(cpr * 100),
    y = sqrt(relab * 100)
  )) +
  geom_point(shape = 21, stroke = 0.75, fill = "white") +
  geom_text(data = fit %>% mutate(clade = factor(clade, levels = clade)), 
            aes(x = Inf, y = -Inf, label = paste("R^{2}: ", r.squared)), 
            size = 7/.pt, hjust = 1.1, vjust = -0.3, parse = TRUE) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#D9D9D9", linewidth = 0.5) + 
  labs(x = "", y = "Sqrt relative abundance (%)") +
  theme_tidy() +
  scale_y_continuous(labels = function(x) sprintf("%.1f", x)) +
  facet_wrap(~clade, scales = "free", ncol = 4) +
  theme(
    strip.background = element_rect(fill = "#F0F0F0")
  )

p2

16S data cultures

NMDS cultures three groundwaters (supplemental)

Expand code
set.seed(999)

nmds <- seqtab %>% # Full size fraction
  inner_join(smd, by = "sample") %>%
  filter(!sample %in% paste("P21015_10", seq(42,81), sep = "")) %>% # Data from bachelor project
  filter(fraction == "non-fractionated" | fraction == "environmental") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.211602 
Run 1 stress 0.2345592 
Run 2 stress 0.2112576 
... New best solution
... Procrustes: rmse 0.008926961  max resid 0.03646891 
Run 3 stress 0.212245 
Run 4 stress 0.2286573 
Run 5 stress 0.211117 
... New best solution
... Procrustes: rmse 0.00353371  max resid 0.02077429 
Run 6 stress 0.2128131 
Run 7 stress 0.2291371 
Run 8 stress 0.211117 
... New best solution
... Procrustes: rmse 1.543506e-05  max resid 8.833604e-05 
... Similar to previous best
Run 9 stress 0.4051623 
Run 10 stress 0.2423527 
Run 11 stress 0.2308933 
Run 12 stress 0.212245 
Run 13 stress 0.2461051 
Run 14 stress 0.2462855 
Run 15 stress 0.2210886 
Run 16 stress 0.2335761 
Run 17 stress 0.2339457 
Run 18 stress 0.211117 
... Procrustes: rmse 1.554376e-05  max resid 6.999867e-05 
... Similar to previous best
Run 19 stress 0.2291882 
Run 20 stress 0.2318306 
*** Best solution repeated 2 times
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores

Plot the ordination

Expand code
p1 <- nmds.scores %>%
  mutate(medium = if_else(medium == "env","Environmental","Culture")) %>%
  # Plot
  ggplot(aes(x = NMDS1, y = NMDS2
             )) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  # Points for samples, coloured by origin
  geom_point(aes(fill = origin,
                 colour = origin,
                 shape = medium,
                 ), size = 3, stroke = 0.5) +
  theme_tidy() +
  scale_shape_manual(values = c("Culture" = 21, "Environmental" = 23), name = "") +
  scale_fill_manual(values = alpha(cols.origin, alpha = 0.5), guide = "none") +
  scale_colour_manual(values = cols.origin, guide = "none") +
  annotate('text', x = -Inf, y = -Inf, size = 7/.pt, 
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = -0.1, vjust = -1,
           ) +
  ggtitle("All cells") +
  theme(
    plot.title = element_text(hjust = .5, size = 7)
    )
Expand code
nmds <- seqtab %>% # Small size fraction
  inner_join(smd, by = "sample") %>%
  filter(!sample %in% paste("P21015_10", seq(42,81), sep = "")) %>% # Data from bachelor project
  filter(fraction == "0.1 - 0.45 µm" | fraction == "Environmental") %>%
  group_by(seqid) %>% filter(count > 2) %>% ungroup() %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>% vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.2081801 
Run 1 stress 0.1495638 
... New best solution
... Procrustes: rmse 0.137786  max resid 0.4403868 
Run 2 stress 0.1495638 
... New best solution
... Procrustes: rmse 1.671599e-05  max resid 4.648518e-05 
... Similar to previous best
Run 3 stress 0.1495638 
... Procrustes: rmse 7.217654e-05  max resid 0.000230497 
... Similar to previous best
Run 4 stress 0.197663 
Run 5 stress 0.1858236 
Run 6 stress 0.1495638 
... Procrustes: rmse 5.761408e-05  max resid 0.0001885813 
... Similar to previous best
Run 7 stress 0.1696327 
Run 8 stress 0.1933782 
Run 9 stress 0.1696589 
Run 10 stress 0.1495638 
... New best solution
... Procrustes: rmse 1.230415e-05  max resid 3.972137e-05 
... Similar to previous best
Run 11 stress 0.1495638 
... Procrustes: rmse 1.771555e-05  max resid 6.448628e-05 
... Similar to previous best
Run 12 stress 0.1496035 
... Procrustes: rmse 0.002179419  max resid 0.009601999 
... Similar to previous best
Run 13 stress 0.1496035 
... Procrustes: rmse 0.002179319  max resid 0.009588255 
... Similar to previous best
Run 14 stress 0.1858235 
Run 15 stress 0.1696587 
Run 16 stress 0.1496035 
... Procrustes: rmse 0.002166634  max resid 0.009522663 
... Similar to previous best
Run 17 stress 0.1495638 
... Procrustes: rmse 2.06804e-05  max resid 7.56214e-05 
... Similar to previous best
Run 18 stress 0.1864292 
Run 19 stress 0.1495638 
... Procrustes: rmse 1.367408e-05  max resid 4.399976e-05 
... Similar to previous best
Run 20 stress 0.1495638 
... Procrustes: rmse 5.688851e-05  max resid 0.0001883812 
... Similar to previous best
*** Best solution repeated 8 times
Expand code
vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample") -> nmds.scores
Expand code
p2 <- nmds.scores %>%
  mutate(medium = if_else(medium == "env","Environmental","Culture")) %>%
  # Plot
  ggplot(aes(x = NMDS1, y = NMDS2)) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  # Points for samples, coloured by nature
  geom_point(aes(
    colour = origin,
    fill = origin, 
    shape = medium), size = 3, stroke = 0.5) +
  theme_tidy() +
  scale_shape_manual(values = c("Culture" = 21, "Environmental" = 23), name = "") +
  scale_fill_manual(values = alpha(cols.origin, alpha = 0.5), guide = "none") +
  scale_colour_manual(values = cols.origin, name = "Groundwater", labels = c("Meteoric", "Marine", "Saline")) +
  annotate('text', x = -Inf, y = -Inf, size = 7/.pt, 
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = -.1, vjust = -1,
           ) +
  ggtitle("0.1 - 0.45 µm fraction") +
  theme(plot.title = element_text(hjust = .5, size = 7),
        axis.title.y = element_blank())
Expand code
library(patchwork)
p1 + p2 + plot_layout(guides = "collect") & 
  theme(
    legend.position = "right",
    plot.background = element_blank(),
    legend.text = element_text(margin = margin()),
    legend.box.margin = margin(l = -10),
    plot.margin = margin(l = 4)
    )

Expand code
ggsave("figures/fig-s1.png", height = 9, width = 18, units = "cm")

Diversity (alpha + beta) bachelor project (P21015)

Estimate alpha diversity using the Shannon-Weaver index on species level

Expand code
adiv <- seqtab %>%
  select(-relab) %>%
  spread(seqid, count, fill = 0) %>% 
  column_to_rownames('sample') %>%
  vegan::diversity() %>% as.data.frame() %>%
  rownames_to_column('sample') %>%
  rename(shannon = 2) %>%
  inner_join(smd, by = "sample")

adiv %>% filter(sample %in% t0) %>%
  group_by(groundwater) %>% 
  summarise(mean = round(mean(shannon), digits = 2), sd = round(sd(shannon), digits = 2), n = n())
# A tibble: 3 × 4
  groundwater  mean    sd     n
  <chr>       <dbl> <dbl> <int>
1 KR0015       5.66  0.62     6
2 SA1420       3.27  1.07     6
3 SA2600       3.03  0.44     6
Expand code
p1 <- adiv %>%
  filter(fraction != "control") %>%
  mutate(fraction = factor(fraction, levels = c("environmental","fractionated","non-fractionated"))) %>%
  ggplot(aes(
    x = groundwater,
    y = shannon,
    fill = fraction
    )) +
  geom_boxplot( outlier.colour = "white") +
  labs(x = "", y = "Alpha diversity (Shannon H index)", fill = "") +
  theme_tidy(legend = "inside") +
  scale_x_discrete(labels = c("Meteoric", "Marine", "Saline")) + 
  scale_fill_manual(values = cols.fraction, labels = c("Environmental", "0.1 - 0.45 µm", "All cells")) +
  theme( 
    legend.margin = margin(),
    legend.background = element_rect(fill = "NA"),
    legend.position.inside = c(0.8,0.88)
  )
Expand code
set.seed(999)

nmds <- seqtab %>%
  filter(sample %in% paste("P21015_10", seq(42,81), sep = "") |
           sample %in% c("P15009_1057","P15009_1059","P15009_1060","P21015_1020","P21015_1021","P21015_1022")
           ) %>%
  select(seqid, sample, count) %>% spread(seqid, count, fill = 0) %>%
  column_to_rownames("sample") %>%
  vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.1203684 
Run 1 stress 0.1448066 
Run 2 stress 0.1374292 
Run 3 stress 0.1212802 
Run 4 stress 0.1331208 
Run 5 stress 0.1230642 
Run 6 stress 0.13485 
Run 7 stress 0.14842 
Run 8 stress 0.133246 
Run 9 stress 0.1408211 
Run 10 stress 0.1212186 
Run 11 stress 0.1440819 
Run 12 stress 0.1336567 
Run 13 stress 0.1400868 
Run 14 stress 0.1447862 
Run 15 stress 0.1504146 
Run 16 stress 0.1212186 
Run 17 stress 0.1342165 
Run 18 stress 0.1305863 
Run 19 stress 0.133246 
Run 20 stress 0.1476626 
*** Best solution was not repeated -- monoMDS stopping criteria:
    14: stress ratio > sratmax
     6: scale factor of the gradient < sfgrmin
Expand code
nmds.scores <- vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  inner_join(smd, by = "sample")
Expand code
p2 <- nmds.scores %>%
  mutate(medium = factor(medium, levels = c("ace","lys","env"))) %>%
  ggplot(aes(x = NMDS1, y = NMDS2, 
             color = fraction, 
             shape = medium,
             fill = fraction
             )
         ) +
  geom_vline(xintercept = 0, linetype = 'dotted') +
  geom_hline(yintercept = 0, linetype = 'dotted') +
  geom_point(size = 4.0, stroke = 0.75) + 
  geom_text(aes(label = age), size = 7/.pt, color = "black", na.rm = TRUE) + 
  scale_shape_manual(values = c("ace" = 23, "lys" = 24, "env" = 21), 
                     labels = c("Acetate", "Lysate", "Environmental")) +
  scale_fill_manual(values = alpha(cols.fraction, alpha = 0.2), labels = c("Environmental", "0.1 - 0.45 µm", "All cells")) +
  scale_color_manual(values = cols.fraction, labels = c("Environmental", "0.1 - 0.45 µm", "All cells")) +
  annotate('text', x = Inf, y = -Inf, size = 7/.pt, hjust = 1.05, vjust = -0.5, 
           label = paste('Stress = ', round(nmds$stress, digits = 2))) +
  theme_tidy(legend = "bottom") +
  theme(
    legend.title = element_blank(),
    legend.box = "vertical",
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    legend.margin = margin(),
    legend.text = element_text(margin = margin(l = 2, r = 8, unit = "pt")),
    legend.key.spacing.x = unit(0, "mm")
    )
Expand code
library(patchwork)


p1 + p2 + plot_annotation(tag_levels = c("a", "b")) & theme(
  legend.key.size = unit(4.5, "mm"),
  legend.title = element_blank(),
  legend.margin = margin(),
  panel.border = element_rect(fill = "transparent", linewidth = 0.5),
  plot.tag.position = c(0.15, 1.05)
  )

Expand code
ggsave("figures/fig-3.pdf", width = 14, height = 9, units = "cm")

Microscopy

Expand code
growth <- read_tsv("data/micro_cultures.tsv", col_types = cols(.default = col_character(), 
                                                  count = col_integer(),
                                                  date = col_date(format = "%d-%m-%Y"))
                   ) %>%
  mutate(week = case_when(
    date == "2021-10-15" ~  1,
    date == "2021-11-11" ~  5,
    date == "2021-11-25" ~  7,
    date == "2021-12-09" ~  9,
    date == "2021-12-30" ~ 12,
    date == "2022-01-18" ~ 15,
    date == "2022-02-03" ~ 17
  )) %>%
  filter(!sample %in% c("P26201_1051","P26201_1024")) %>%
  mutate(label = case_when(
    groundwater == "KR0015" ~  "Meteoric",
    groundwater == "SA1420" ~  "Marine",
    groundwater == "SA2600" ~  "Saline"
  ))

counts <- tibble(
  label = c("Meteoric","Meteoric","Marine","Marine","Saline","Saline"),
  n = c(680220, 890910, 59220, 73910, 14100, 23000)
) %>% group_by(label) %>% summarise(mean = mean(n), sd = sd(n))
Expand code
# Microscopy counts on KR0015 groundwater in duplicates, obtaining 680,220 and 890,910 cells ml-1.
# Microscopy counts on SA1420 groundwater in duplicates, obtaining 59220 and 73910 cells ml-1.
# For SA2600, this number was 14,100 and 23,000 cells ml-1

growth %>%
  ggplot(aes(x = week, y = count, colour = fraction)) +
  geom_point() +
  geom_line(aes(linetype = medium)) + facet_wrap(~ label, ncol = 3) +
  geom_hline(data = counts, aes(yintercept = mean)) +
  scale_y_log10(limits = c(1e4,1e8),
                breaks = c(1e4,1e5,1e6,1e7),
                labels = c(expression("10"^"4"),
                           expression("10"^"5"),
                           expression("10"^"6"),
                           expression("10"^"7"))
                ) +
  scale_x_continuous(breaks = c(1,3,5,7,9,11,13,15,17)) +
  scale_colour_manual(values = c("unfiltered" = "#1B5331", "filtered" = "#A2A475"),
                      labels = c("0.1 - 0.45 µm","All cells")) +
  scale_linetype(name = "Medium:", labels = c("Acetate", "Cell lysate")) +
  labs(x = "", y = expression("Cell number (mL"^"-1"*")"), colour = "Fraction:") +
  theme_tidy(legend = "bottom") + 
  theme(
    legend.margin = margin(t = -10),
    plot.margin = margin(),
    panel.border = element_blank(),
    axis.line = element_line(linewidth = 0.5, colour = "black")
    )
Figure 3: Cell counts of enrichment cultures according to fluorescence microscopy. Each line represents the cell density of a single culture and the black horizontal line denotes the mean cell count of the groundwater at t0 (n = 2).
Expand code
ggsave(filename = "figures/fig-s2.pdf", width = 18, height = 9, units = "cm")

QPCR KR0015

Expand code
qpcr <- read_tsv('data/qpcr.tsv', col_types = cols(.default = col_character(), 
                                      quantity = col_double(),
                                      age = col_factor(levels = seq(1:10) %>% as.character())
                                      )
         ) %>% mutate(medium = gsub("ace","srm", medium))
Expand code
label <- c(
  'fractionated' = '0.1 - 0.45 µm fraction',
  'non-fractionated' = 'All cells'
)

qpcr %>%
  group_by(sample, Kingdom) %>% 
  summarise(mean = mean(quantity), sd = sd(quantity), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>%
  mutate(category = paste(medium, Kingdom)) %>%
  # Call plot
  ggplot(aes(x = fct_relevel(age, seq(1:10) %>% as.character), 
             y = mean, 
             group = category, 
             colour = fct_relevel(Kingdom, c("Bacteria","Archaea"))
             )
         ) +
  geom_line(aes(linetype = medium)) +
  facet_wrap(~ fraction, labeller = as_labeller(label)) +
  geom_pointrange(aes(ymin = mean - sd, 
                      ymax = mean + sd), 
                  size = 0.2) +
  labs(x = 'Week', y = expression("Gene copy number (mL"^"-1"*")")) +
  scale_y_log10(breaks = c(1e3,1e4,1e5,1e6,1e7), 
                labels = c(expression("10"^"3"),
                           expression("10"^"4"),
                           expression("10"^"5"),
                           expression("10"^"6"),
                           expression("10"^"7")
                           )
                ) +
  scale_linetype(name = "Medium:", labels = c("Acetate", "Cell lysate")) +
  scale_color_manual(name = "", values = c("Bacteria" = "#1B5331","Archaea" = "#A2A475")) +
  theme_tidy(legend = "bottom") +
  theme(
    plot.margin = margin(),
    legend.text = element_text(margin = margin(l = 0)),
    axis.ticks.length = unit(0.5, "mm"),
    panel.border = element_rect(fill = "transparent", linewidth = 0.75),
    strip.background = element_blank(),
    legend.box.margin = margin(-8),
    )

Expand code
ggsave(filename = "figures/fig-2.pdf", width = 12, height = 8, units = "cm")

Areaplot KR0015

Expand code
t <- seqtab %>%
  filter(sample %in% paste("P21015", seq(1042,1081), sep = "_")
           ) %>%
  inner_join(tax, by = "seqid") %>%
  group_by(phylum, sample) %>%
  # Sum the abundance of each phylum within a sample
  summarise(relab = sum(relab), .groups = 'drop_last') %>%
  # Calculate the mean abundance of each phylum over the categories
  summarise(mean_relab = sum(relab), .groups = 'drop') %>%
  filter(!is.na(phylum)) %>%
  top_n(9, mean_relab) %>% filter(phylum != "Bacteroidota")

# The most abundant phyla, averaged over all cultures
t %>% arrange(desc(mean_relab))
# A tibble: 8 × 2
  phylum           mean_relab
  <chr>                 <dbl>
1 Pseudomonadota       10.8  
2 Acidobacteriota       7.85 
3 Spirochaetota         6.28 
4 Patescibacteria       6.15 
5 Desulfobacterota      3.27 
6 Bacillota             1.55 
7 Nanoarchaeota         0.969
8 Omnitrophota          0.853
Expand code
taxref <- tax %>%
  left_join(t %>% transmute(phylum, topphylum = phylum), by = "phylum") %>%
  replace_na(list("topphylum" = "Other"))

cols.phylum <- c(
  "Pseudomonadota" = "#972D15",
  "Acidobacteriota" = "#FAEFD1",
  "Spirochaetota" = "#D8B70A",
  "Patescibacteria" = "#C7B19C",
  "Desulfobacterota" = "#A2A475",
  "Bacillota" = "#1B5331",
  "Nanoarchaeota" = "#046C9A",
  "Omnitrophota" = "#00A08A",
  "Other" = "#899DA4"
)
Expand code
seqtab %>%
  filter(sample %in% paste("P21015", seq(1042,1081), sep = "_")
           ) %>%
  inner_join(taxref, by = "seqid") %>%
  # Summarize in order to have the sum for each category and topphylum
  group_by(sample, topphylum) %>% 
  summarise(relab = sum(relab), .groups = 'drop') %>%
  inner_join(smd, by = "sample") %>%
  mutate(age = as.numeric(.$age)) %>% 
  mutate(fraction = case_when(
    fraction == "non-fractionated" & medium == "ace" ~ "Acetate, all cells",
    fraction == "non-fractionated" & medium == "lys" ~ "Lysate, all cells",
    fraction == "fractionated" & medium == "ace" ~ "Acetate, < 0.45 µm fraction",
    fraction == "fractionated" & medium == "lys" ~ "Lysate, < 0.45 µm fraction"
  )) %>%
  mutate(topphylum = factor(topphylum, levels = rev(names(cols.phylum)))) %>%
  # Call the plot
  ggplot(aes(
    age, 
    relab * 100, 
    fill = topphylum)) +
  geom_col() + 
  scale_x_continuous(n.breaks = 10) +
  facet_wrap(~ fraction, nrow = 1) +
  labs(x = 'Week', y = 'Relative abundance (%)', fill = '') +
  scale_fill_manual(values = cols.phylum) + 
  guides(fill = guide_legend(reverse = TRUE, ncol = 3)) +
  theme_tidy(ratio = 1.2, legend = "bottom") + 
  theme(
    panel.border = element_rect(fill = "transparent", linewidth = 0.5),
    strip.background = element_blank(), 
    legend.background = element_blank(),
    legend.key.height = unit(4, "mm"),
    legend.key.spacing.y = unit(0, "mm"),
    plot.margin = margin(),
    axis.ticks.length = unit(0.5, "mm"),
    legend.box.margin = margin(t = -5)
    )

Expand code
ggsave("figures/fig-s3.png", height = 9, width = 18, units = "cm")

Genome-resolved metagenomics

Basic stats on the MAGs

Expand code
t <- c("Bacillota","Desulfobacterota","Pseudomonadota","Patescibacteria","Nanoarchaeota","Other")

# Prepare data for plot
i <- quality %>%
  inner_join(bintax, by = "genome") %>%
  mutate(phylum = ifelse(phylum %in% t, phylum, "Other")) %>%
  mutate(phylum = factor(phylum, levels = t)) %>%
  select(genome, phylum, completeness, gc_content, genome_size, coding_density) %>%
  distinct() %>%
  mutate(genome.size = genome_size / 1e6)
Expand code
lm(gc_content ~ genome_size, data = quality) %>% summary()

Call:
lm(formula = gc_content ~ genome_size, data = quality)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.15655 -0.06232 -0.02604  0.07979  0.13851 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 3.058e-01  3.670e-02   8.332 1.26e-09 ***
genome_size 5.080e-08  1.029e-08   4.935 2.23e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.09515 on 33 degrees of freedom
Multiple R-squared:  0.4246,    Adjusted R-squared:  0.4072 
F-statistic: 24.36 on 1 and 33 DF,  p-value: 2.233e-05
Expand code
p1 <- i %>%
  ggplot(aes(
    genome.size, 
    gc_content, 
    color = phylum,
    fill = phylum
    )) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#d9d9d9", se = TRUE, linewidth = 0.4) +
  geom_point(size = 2.5, shape = 21, stroke = 0.5) + 
  labs(x = "Estimated genome size (Mbp)", y = "Estimated GC content (%)", color = "") +
  scale_colour_manual(values = cols.mag) +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") +
  theme_tidy() + 
  scale_x_continuous(breaks = c(1,2,3,4,5,6)) +
  annotate("text", x = Inf, y = -Inf, 
           label = "paste(R ^ 2, \" = 0.41\")", 
           size = 7/.pt, vjust = -0.5, hjust = 1, parse = TRUE) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank()
  )
Expand code
p2 <- i %>%
  ggplot(aes(
    phylum, 
    completeness, 
    group = phylum, 
    fill = phylum, 
    color = phylum
    )) +
  geom_boxplot(linewidth = 0.5, outlier.colour = "white") +
  geom_jitter(width = 0.2, size = 2, shape = 21, stroke = 0.5) +
  labs(x = "", y = "Estimated completeness (%)", fill = "") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") + 
  scale_color_manual(values = cols.mag, guide = "none") +
  theme_tidy(legend = "bottom") +
  theme(
    axis.line = element_line(),
    panel.border = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank(),
    axis.title.y = element_text(margin = margin()),
    plot.margin = margin()
  )
Expand code
lm(coding_density ~ genome_size, data = quality) %>% summary()

Call:
lm(formula = coding_density ~ genome_size, data = quality)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.05556 -0.01303 -0.00046  0.01275  0.03787 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.172e-01  8.112e-03 113.064  < 2e-16 ***
genome_size -6.991e-09  2.275e-09  -3.072  0.00424 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.02103 on 33 degrees of freedom
Multiple R-squared:  0.2224,    Adjusted R-squared:  0.1989 
F-statistic: 9.439 on 1 and 33 DF,  p-value: 0.004236
Expand code
p3 <- i %>%
  ggplot(aes(
    genome.size, 
    coding_density, 
    color = phylum,
    fill = phylum
    )) +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black", fill = "#d9d9d9", se = TRUE, linewidth = 0.4) +
  geom_point(size = 2.5, shape = 21, stroke = 0.5) + 
  labs(x = "Estimated genome size (Mbp)", y = "Estimated coding density", color = "") +
  scale_colour_manual(values = cols.mag, guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), guide = "none") +
  theme_tidy() + 
  scale_x_continuous(breaks = c(1,2,3,4,5,6)) +
  scale_y_continuous(n.breaks = 7) +
  annotate("text", x = Inf, y = Inf, 
           label = "paste(R ^ 2, \" = 0.20\")", 
           size = 7/.pt, vjust = 1, hjust = 1, parse = TRUE) +
  theme(
    axis.line = element_line(),
    panel.border = element_blank()
  )
Expand code
nmds <- coverm %>%
  # Sum the abundance of both coverage per metagenome
  mutate(abundance = 1) %>% 
  inner_join(emapper, by = "genome", relationship = "many-to-many") %>%
  # Filter out NA and add multiple KOs on new line
  filter(KEGG_ko != "-") %>% separate_rows(KEGG_ko, sep = ",") %>%
  # Sum the abundance of each KO within the metagenomes
  group_by(genome, KEGG_ko) %>% 
  summarise(abundance = sum(abundance), .groups = "drop") %>%
  # Pivot to prepare a numerical matrix
  pivot_wider(names_from = "KEGG_ko", values_from = "abundance", values_fill = 0) %>%
  column_to_rownames("genome") %>%
  vegan::metaMDS()
Square root transformation
Wisconsin double standardization
Run 0 stress 0.06277184 
Run 1 stress 0.06384113 
Run 2 stress 0.06856721 
Run 3 stress 0.06286035 
... Procrustes: rmse 0.07810586  max resid 0.1485213 
Run 4 stress 0.06368732 
Run 5 stress 0.06286033 
... Procrustes: rmse 0.07811342  max resid 0.1485397 
Run 6 stress 0.06362823 
Run 7 stress 0.06902555 
Run 8 stress 0.06610897 
Run 9 stress 0.0679945 
Run 10 stress 0.06384089 
Run 11 stress 0.06299223 
... Procrustes: rmse 0.07837556  max resid 0.1496081 
Run 12 stress 0.06299208 
... Procrustes: rmse 0.07823435  max resid 0.1492526 
Run 13 stress 0.06386464 
Run 14 stress 0.07029334 
Run 15 stress 0.06796592 
Run 16 stress 0.06500024 
Run 17 stress 0.1937571 
Run 18 stress 0.06459237 
Run 19 stress 0.06660911 
Run 20 stress 0.06299231 
... Procrustes: rmse 0.0783978  max resid 0.1496653 
*** Best solution was not repeated -- monoMDS stopping criteria:
     2: no. of iterations >= maxit
    18: stress ratio > sratmax
Expand code
nmds.scores <- vegan::scores(nmds)$sites %>%
  as.data.frame() %>%
  rownames_to_column("genome")
Expand code
p4 <- nmds.scores %>%
  inner_join(bintax, by = "genome") %>%
  mutate(phylum = ifelse(phylum %in% t, phylum, "Other")) %>%
  # Plot
  ggplot(aes(x = NMDS1, 
             y = NMDS2, 
             colour = fct_relevel(phylum,t),
             fill = phylum
             )) +
  geom_vline(xintercept = 0, linetype = 'dotted', linewidth = 0.5) +
  geom_hline(yintercept = 0, linetype = 'dotted', linewidth = 0.5) +
  # Points for samples, coloured by nature
  geom_point(size = 3, shape = 21, stroke = 0.5) +
  theme_tidy() +
  scale_color_manual(values = cols.mag, name = "Phylum", guide = "none") +
  scale_fill_manual(values = alpha(cols.mag, alpha = 0.5), name = "", guide = "none") +
  scale_y_continuous(breaks = c(-0.5, 0, 0.5)) + 
  annotate('text', x = Inf, y = -Inf, size = 7/.pt,
           label = paste('Stress = ', round(nmds$stress, digits = 2)),
           hjust = 1.05, vjust = -0.5) +
  theme(
    axis.title.y = element_text(margin = margin()),
    legend.spacing.x = unit(-0.5, "mm")
    )
Expand code
library(patchwork)

p1 + p2 + p3 + p4 + plot_layout(guides = "collect", nrow = 2) & plot_annotation(tag_levels = c("a", "b", "c", "d")) &
  theme(
    plot.tag = element_text(size = 7),
    plot.tag.position = c(0.1, 1.03),
    legend.position = "bottom",
    axis.ticks.length = unit(".75", "mm"),
    legend.box.margin = margin(),
    legend.text = element_text(hjust = 0),
    legend.key.size = unit("3", "mm")
    )

Expand code
ggsave(filename = "figures/fig-4.pdf", width = 12, height = 14, units = "cm")

Metabolic weight

Expand code
samples <- coverm$sample %>% unique()

mw <- data.frame()
for (file in list.files("data/mw", full.names = TRUE)) {
  name <- substr(file, 19, nchar(file)) %>% gsub(".txt", "", .)
  name <- paste("S", name, sep = "")
  name <- samples[grepl(name, samples)]
  i <- read_tsv(file, show_col_types = F, comment = "##") %>% 
    pivot_longer(3:ncol(.), names_to = "genome", values_to = "mw") %>%
    mutate(sample = name) %>%
    filter(mw > 0)
  i <- i %>% rename(pathway = 1, mw.function = 2) %>% select(sample, genome, pathway, mw.function, mw) %>% mutate(pathway = substr(.$pathway, 8, nchar(.)))
  mw <- mw %>% rbind(i)
}
Expand code
i <- list(sample = c("VK-3516-l3u_S16", "VK-3516-l3u_S16"),
          genome = c("MEGAHIT-MaxBin2Refined-VK-3516-KFM02a-3_S40_L002.026", "MEGAHIT-MetaBAT2Refined-VK-3516-KFM02a-3_S40_L002.49_sub"),
          pathway = c("Organic carbon oxidation - complex carbon degradation", "Organic carbon oxidation - complex carbon degradation"), 
          mw.function = c(0, 0), 
          mw = c(0, 0))

mw <- mw %>% 
  # Some MAGs do not have any metabolic weight: assign a dummy
  rbind(i) %>%
  mutate(label = case_when(
  pathway == "Organic carbon oxidation - complex carbon degradation" ~ "01: Organic C oxidation: complex C",
  pathway == "Organic carbon oxidation - amino acid utilization" ~ "02: Organic C oxidation: AA utilization",
  pathway == "Organic carbon oxidation - aromatics degradation" ~ "03: Organic C oxidation: aromatics",
  pathway == "Carbon fixation - CBB cycle (Rubisco)" ~ "04: C fixation: CBB cycle",
  pathway == "Carbon fixation - Reverse TCA cycle" ~ "05: C fixation: Reverse TCA",
  pathway == "Carbon fixation - 3HP/4HB" ~ "06: C fixation: 3HP/4HB",
  pathway == "Carbon fixation - Wood-Ljungdahl pathway" ~ "07: C fixation: Wood-Ljungdahl",
  pathway == "Acetate oxidation" ~ "08: Acetate oxidation",
  pathway == "Ethanol oxidation" ~ "09: Ethanol oxidation",
  pathway == "Fermentation" ~ "10: Fermentation",
  pathway == "Hydrogen generation" ~ "11: Hydrogen generation",
  pathway == "Hydrogen oxidation" ~ "12: Hydrogen oxidation",
  pathway == "N2 fixation - nifDK||vnfDKG||nifH" ~ "13: N fixation: nifDK, nifH",
  pathway == "Nitrate reduction - napAB" ~ "13: Nitrate reduction: napAB",
  pathway == "Nitrite ammonification - nirBD" ~ "14: Nitrite ammonification: nirBD",
  pathway == "Nitrite ammonification - nrfADH" ~ "15: Nitrite ammonification: nrfADH",
  pathway == "Sulfur oxidation - sdo" ~ "16: Sulfur oxidation: sdo",
  pathway == "Sulfate reduction" ~ "17: Sulfate reduction",
  pathway == "Sulfide oxidation - sqr" ~ "18: Sulfide oxidation: sqr",
  .default = "19: Other"
)) %>%
  mutate(order = substr(label, 1, 2)) %>%
  mutate(label = substr(label, 5, nchar(.))) %>%
  arrange(order)   


cols.phylum <- c(
  "Desulfobacterota" = "#A2A475",
  "Pseudomonadota" = "#972D15",
  "Bacillota" = "#1B5331",
  "Patescibacteria" = "#C7B19C",
  "Nanoarchaeota" = "#00A08A",  
  "Chloroflexota" = "#D69C4E",
  "Spirochaetota" = "#D8B70A",
  "Campylobacterota" = "#FAEFD1",
  "Iainarchaeota" = "#046C9A"
)
Expand code
name <- bintax %>%
  filter(phylum %in% names(cols.phylum)) %>%
  mutate(phylum = factor(phylum, levels = names(cols.phylum))) %>%
  arrange(phylum) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  pull(taxon) %>% unique()

p1 <- bintax %>%
  # Use subset of bins
  filter(phylum %in% names(cols.phylum)) %>%
  mutate(taxonomy = "Taxonomy") %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  # Plot
  ggplot(aes(fct_relevel(taxon, name), 
             taxonomy, 
             fill = fct_relevel(phylum, names(cols.phylum))
             )) +
  geom_tile(colour = "white") +
  labs(x = "", y = "", fill = "") +
  scale_fill_manual(values = cols.phylum) +
  scale_x_discrete(position = "top") + 
  theme_tidy(ratio = NULL) +
  coord_equal() +
  theme(
    legend.key.height = unit(3, "mm"),
    legend.key.width = unit(3, "mm"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank(),
    axis.text.x = element_text(angle = 90, hjust = 0),
    axis.text.y = element_blank(),
    panel.border = element_rect(linewidth = 1,fill = NA)
  )
Expand code
i <- mw %>%
  filter(label != "Other") %>%
  pull(label) %>% unique()

p2 <- mw %>%
  group_by(genome, label) %>% summarise(mw = mean(mw), .groups = "drop") %>%
  inner_join(bintax, by = "genome") %>% filter(phylum %in% names(cols.phylum)) %>%
  mutate(genus = case_match(genus, NA ~ family, .default = genus)) %>%
  mutate(taxon = if_else(is.na(species), paste(genus, "sp."), species)) %>%
  # Order the genomes
  mutate(taxon = factor(taxon, name)) %>%
  # Order the pathways
  mutate(label = factor(label, levels = rev(i))) %>%
  # Filter out pathways
  filter(!is.na(label)) %>%
  # Plot
  ggplot(aes(
    taxon, 
    label
    )) +
  geom_point(aes(size = mw), shape = 21, stroke = 0.5, fill = "white") +
  theme_tidy() +
  geom_vline(xintercept = 7.5, linewidth = 0.2) +
  geom_vline(xintercept = 13.5, linewidth = 0.2) +
  geom_vline(xintercept = 19.5, linewidth = 0.2) +
  geom_vline(xintercept = 22.5, linewidth = 0.2) +
  geom_vline(xintercept = 24.5, linewidth = 0.2) +
  labs(x = "", y = "", size = "Metabolic\nweight score") +
  theme(
    legend.spacing.x = unit(0, "mm"),
    legend.box.margin = margin(),
    panel.border = element_rect(linewidth = 1, fill = NA),
    axis.ticks = element_blank(),
    axis.text.x = element_blank(),
    panel.grid.major = element_blank()
  )
Expand code
library(patchwork)

p1 / p2 & theme(
  legend.box.margin = margin(l = -8),
  plot.margin = margin(b = 2)
  )

Expand code
ggsave("figures/fig-5.pdf", width = 18, height = 18, units = "cm")